import pandas as pd
import numpy as np
import datetime
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
#import plotly.figure_factory as ff
from warnings import filterwarnings
pd.options.display.max_columns = 500
#!pip install plotly_express
import seaborn as sns
import plotly_express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import psycopg2
engine = psycopg2.connect(
database="final_db",
user="juan",
password="1234",
host="nps-demo-instance.c2fezqs1nmx5.us-east-2.rds.amazonaws.com",
port='5432'
)
# modelo con 3 millones de registros
cursor=engine.cursor()
import pandas as pd
sql = """
SELECT *
FROM fix_secopi
"""
df=pd.read_sql(sql, con=engine)
El objetivo del notebook es desarrollar modelos con la información actualmente disponible en la bases de datos fix_secopi. Esta base de datos contiene las siguientes modificaciones respecto a la base de datos del secop:
1- Se incluyo la columna "SANCIÓN", la cual incluye un identificación para los contratos que han sido reportados como contratos con eventos de corrupción.
2- Se incluyo la columna "Terminado" indica que un contrato ha sido entregado o finalizado de forma adecuada. Los contratos que presentan sanción no puede ser calificado como terminado
df["Log_cuantia_contrato"] =[np.log(x) if x !=0 else 0 for x in df["cuantia_contrato"]]
df["Log_valor_contrato_con_adiciones"] =[np.log(x) if x !=0 else 0 for x in df["valor_contrato_con_adiciones"]]
df["Log_valor_total_de_adiciones"] =[np.log(x) if x !=0 else 0 for x in df["valor_total_de_adiciones"]]
df["terminado"]=[int(x) for x in df["terminado"]]
df["sancion"]=[int(x) for x in df["sancion"]]
colInteres=['identificacion_del_contratista', 'nombre_de_la_entidad','nit_de_la_entidad', 'nivel_entidad', 'orden_entidad',
'municipio_obtencion', 'municipio_entrega','nom_raz_social_contratista', 'moneda','cuantia_contrato',
'valor_contrato_con_adiciones','valor_total_de_adiciones',"Log_cuantia_contrato","Log_valor_contrato_con_adiciones",
"Log_valor_total_de_adiciones",'sancion', 'anno_firma_del_contrato','estado_del_proceso', 'fecha_de_firma_del_contrato',
'fecha_ini_ejec_contrato', 'fecha_fin_ejec_contrato','municipios_ejecucion', 'departamento_ejecucion', 'latitud',
'longitud','terminado']
Entidades con mayor cantidad de contratos, mayor cuantia del contrato, mayor cantidad de adiciones, mayor cantidad de sanciones, mayor cantidad de contratos terminados
aggfun_empresa = {"cuantia_contrato":["sum","max","count"],
"sancion":"sum",
"terminado":"sum"}
df_GB=df[colInteres].groupby("nombre_de_la_entidad").agg(aggfun_empresa).reset_index()
df_GB.columns = ["_".join(x) for x in df_GB.columns.ravel()]
df_GB.sort_values(by=("cuantia_contrato_sum"),ascending=False)[:15]
Entidades con la mayor cuantía contratada
px.bar(df_GB.sort_values(by=("cuantia_contrato_sum"), ascending=False)[:15].sort_values(by=("cuantia_contrato_sum"), ascending=True),x="cuantia_contrato_sum",y="nombre_de_la_entidad_",orientation='h')
Entidades del estado con mayor cantidad de contratos asignados
px.bar(df_GB.sort_values(by=("cuantia_contrato_count"), ascending=False)[:15].sort_values(by=("cuantia_contrato_count"), ascending=True),x="cuantia_contrato_count",y="nombre_de_la_entidad_",orientation='h')
Entidades del estado con mayor cantidad de contratos terminados
px.bar(df_GB.sort_values(by=("terminado_sum"), ascending=False)[:15].sort_values(by=("terminado_sum"), ascending=True),x="terminado_sum",y="nombre_de_la_entidad_",orientation='h')
px.bar(df_GB.sort_values(by=("sancion_sum"), ascending=False)[:15].sort_values(by=("sancion_sum"), ascending=True),x="sancion_sum",y="nombre_de_la_entidad_",orientation='h')
aggfun_empresa = {"cuantia_contrato":["sum","max","count"],
"sancion":"sum",
"terminado":"sum"}
df_GBContratista=df[colInteres].groupby("nom_raz_social_contratista").agg(aggfun_empresa).reset_index()
df_GBContratista.columns = ["_".join(x) for x in df_GBContratista.columns.ravel()]
Empresas con la mayor cuantía contratada
px.bar(df_GBContratista.sort_values(by=("cuantia_contrato_sum"), ascending=False)[:15].sort_values(by=("cuantia_contrato_sum"), ascending=True),x="cuantia_contrato_sum",y="nom_raz_social_contratista_",orientation='h')
Empresas con mayor cantidad de contratos asignados
px.bar(df_GBContratista.sort_values(by=("cuantia_contrato_count"), ascending=False)[:15].sort_values(by=("cuantia_contrato_count"), ascending=True),x="cuantia_contrato_count",y="nom_raz_social_contratista_",orientation='h')
Empresas con mayor cantidad de contratos terminados
px.bar(df_GBContratista.sort_values(by=("terminado_sum"), ascending=False)[:15].sort_values(by=("terminado_sum"), ascending=True),x="terminado_sum",y="nom_raz_social_contratista_",orientation='h')
Compañias con mayor cantidad de contratos sancionados
px.bar(df_GBContratista.sort_values(by=("sancion_sum"), ascending=False)[:15].sort_values(by=("sancion_sum"), ascending=True),x="sancion_sum",y="nom_raz_social_contratista_",orientation='h')
#!pip install wordcloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
WordCloud con el df agrupado por razón social de contratista
Contratistas = " ".join(str(contrato) for contrato in df_GBContratista["nom_raz_social_contratista_"])
stopwords = set(STOPWORDS)
stopwords.update(["DE", "DEL", "SA","SAS", "MUNICIPIO"])
wordcloud = WordCloud(stopwords=stopwords,max_font_size=50, max_words=100, background_color="white").generate(Contratistas)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.figure(figsize=[20,20])
plt.show()
WordCloud con el df agrupado por razón social de la entidad estatal
Contratistas = " ".join(str(contrato) for contrato in df_GB["nombre_de_la_entidad_"])
stopwords = set(STOPWORDS)
stopwords.update(["DE", "DEL", "SA","SAS", "MUNICIPIO"])
wordcloud = WordCloud(stopwords=stopwords,max_font_size=50, max_words=100, background_color="white").generate(Contratistas)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.figure(figsize=[25,25])
plt.show()
#NetworkAnalysis
import networkx as nx
df_sancion=df[df.sancion==1]
filtrar_datos=df_sancion.groupby(["nom_raz_social_contratista"]).agg("count")
filtrar_datos=filtrar_datos[filtrar_datos.identificacion_del_contratista >2].reset_index()["nom_raz_social_contratista"]
df_sancion_filtrados=df_sancion[df_sancion.nom_raz_social_contratista.isin(filtrar_datos)]
df_sancion_filtrados.shape
A = list(df_sancion_filtrados["nom_raz_social_contratista"].unique())
B = list(df_sancion_filtrados["nombre_de_la_entidad"].unique())
node_list = list(set(A+B))
G = nx.Graph()
for i in node_list:
G.add_node(i)
for i,j in df_sancion_filtrados.iterrows():
G.add_edges_from([(j["nom_raz_social_contratista"],j["nombre_de_la_entidad"])])
pos = nx.spring_layout(G, k=0.5, iterations=50)
for n, p in pos.items():
G.nodes[n]['pos'] = p
import plotly.graph_objs as go
from networkx.drawing.nx_agraph import graphviz_layout
edge_trace = go.Scatter(
x=[],
y=[],
line=dict(width=0.5,color='#888'),
hoverinfo='none',
mode='lines')
for edge in G.edges():
x0, y0 = G.nodes[edge[0]]['pos']
x1, y1 = G.nodes[edge[1]]['pos']
edge_trace['x'] += tuple([x0, x1, None])
edge_trace['y'] += tuple([y0, y1, None])
node_trace = go.Scatter(
x=[],
y=[],
text=[],
mode='markers',
hoverinfo='text',
marker=dict(
showscale=True,
colorscale='RdBu',
reversescale=True,
color=[],
size=15,
colorbar=dict(
thickness=10,
title='Node Connections',
xanchor='left',
titleside='right'
),
line=dict(width=0)))
for node in G.nodes():
x, y = G.nodes[node]['pos']
node_trace['x'] += tuple([x])
node_trace['y'] += tuple([y])
for node, adjacencies in enumerate(G.adjacency()):
node_trace['marker']['color']+=tuple([len(adjacencies[1])])
node_info = adjacencies[0] +' # of connections: '+str(len(adjacencies[1]))
node_trace['text']+=tuple([node_info])
fig = go.Figure(data=[edge_trace, node_trace],
layout=go.Layout(
title='Red de contratación',
titlefont=dict(size=16),
showlegend=False,
hovermode='closest',
margin=dict(b=20,l=5,r=5,t=40),
annotations=[ dict(
text="No. de contratos sancionados",
showarrow=False,
xref="paper", yref="paper") ],
xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))
iplot(fig)
# Modelamiento de solo interacción de compañias con
A = list(df_sancion_filtrados["nom_raz_social_contratista"].unique())
B = list(df_sancion_filtrados["nombre_de_la_entidad"].unique())
node_list = list(set(A+B))
G = nx.Graph()
for i in node_list:
G.add_node(i)
for i,j in df_sancion_filtrados.iterrows():
G.add_edges_from([(j["nom_raz_social_contratista"],j["nombre_de_la_entidad"])])
pos = nx.spring_layout(G, k=0.5, iterations=50)
for n, p in pos.items():
G.nodes[n]['pos'] = p
import plotly.graph_objs as go
from networkx.drawing.nx_agraph import graphviz_layout
edge_trace = go.Scatter(
x=[],
y=[],
line=dict(width=0.5,color='#888'),
hoverinfo='none',
mode='lines')
for edge in G.edges():
x0, y0 = G.nodes[edge[0]]['pos']
x1, y1 = G.nodes[edge[1]]['pos']
edge_trace['x'] += tuple([x0, x1, None])
edge_trace['y'] += tuple([y0, y1, None])
node_trace = go.Scatter(
x=[],
y=[],
text=[],
mode='markers',
hoverinfo='text',
marker=dict(
showscale=True,
colorscale='RdBu',
reversescale=True,
color=[],
size=15,
colorbar=dict(
thickness=10,
title='Node Connections',
xanchor='left',
titleside='right'
),
line=dict(width=0)))
for node in G.nodes():
x, y = G.nodes[node]['pos']
node_trace['x'] += tuple([x])
node_trace['y'] += tuple([y])
for node, adjacencies in enumerate(G.adjacency()):
node_trace['marker']['color']+=tuple([len(adjacencies[1])])
node_info = adjacencies[0] +' # of connections: '+str(len(adjacencies[1]))
node_trace['text']+=tuple([node_info])
fig = go.Figure(data=[edge_trace, node_trace],
layout=go.Layout(
title='Red de contratación',
titlefont=dict(size=16),
showlegend=False,
hovermode='closest',
margin=dict(b=20,l=5,r=5,t=40),
annotations=[ dict(
text="No. de contratos sancionados",
showarrow=False,
xref="paper", yref="paper") ],
xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))
iplot(fig)
df_sancion_entidadestop=df_sancion.groupby(["nombre_de_la_entidad"]).agg({"sancion":"count"}).sort_values("sancion",ascending=False)[:10].reset_index()["nombre_de_la_entidad"]
df_sancion_entidadestop
df_sancion_top = df_sancion[df_sancion["nombre_de_la_entidad"].isin(df_sancion_entidadestop[0:5])]
A = list(df_sancion_top["nom_raz_social_contratista"].unique())
B = list(df_sancion_top["nombre_de_la_entidad"].unique())
node_list = list(set(A+B))
G = nx.Graph()
for i in node_list:
G.add_node(i)
for i,j in df_sancion_top.iterrows():
G.add_edges_from([(j["nom_raz_social_contratista"],j["nombre_de_la_entidad"])])
pos = nx.spring_layout(G, k=0.5, iterations=50)
for n, p in pos.items():
G.nodes[n]['pos'] = p
edge_trace = go.Scatter(
x=[],
y=[],
line=dict(width=0.5,color='#888'),
hoverinfo='none',
mode='lines')
for edge in G.edges():
x0, y0 = G.nodes[edge[0]]['pos']
x1, y1 = G.nodes[edge[1]]['pos']
edge_trace['x'] += tuple([x0, x1, None])
edge_trace['y'] += tuple([y0, y1, None])
node_trace = go.Scatter(
x=[],
y=[],
text=[],
mode='markers',
hoverinfo='text',
marker=dict(
showscale=True,
colorscale='RdBu',
reversescale=True,
color=[],
size=15,
colorbar=dict(
thickness=10,
title='Node Connections',
xanchor='left',
titleside='right'
),
line=dict(width=0)))
for node in G.nodes():
x, y = G.nodes[node]['pos']
node_trace['x'] += tuple([x])
node_trace['y'] += tuple([y])
for node, adjacencies in enumerate(G.adjacency()):
#if len(adjacencies[1])>1:
node_trace['marker']['color']+=tuple([len(adjacencies[1])])
node_info =adjacencies[0] +' # of connections: '+str(len(adjacencies[1]))
node_trace['text']+=tuple([node_info])
fig = go.Figure(data=[edge_trace, node_trace],
layout=go.Layout(
title='Red de contratación',
titlefont=dict(size=16),
showlegend=False,
hovermode='closest',
margin=dict(b=20,l=5,r=5,t=40),
annotations=[ dict(
text="No. de contratos sancionados",
showarrow=False,
xref="paper", yref="paper") ],
xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))
iplot(fig)
df_sancion_entidadestop=df_sancion.groupby(["nombre_de_la_entidad"]).agg({"sancion":"count"}).sort_values("sancion",ascending=False)[:10].reset_index()["nombre_de_la_entidad"]
df_sancion_entidadestop
df_sancion_top = df_sancion[df_sancion["nombre_de_la_entidad"].isin([df_sancion_entidadestop[0]])]
A = list(df_sancion_top["nom_raz_social_contratista"].unique())
B = list(df_sancion_top["nombre_de_la_entidad"].unique())
node_list = list(set(A+B))
G = nx.Graph()
for i in node_list:
G.add_node(i)
for i,j in df_sancion_top.iterrows():
G.add_edges_from([(j["nom_raz_social_contratista"],j["nombre_de_la_entidad"])])
pos = nx.spring_layout(G, k=0.5, iterations=50)
for n, p in pos.items():
G.nodes[n]['pos'] = p
edge_trace = go.Scatter(
x=[],
y=[],
line=dict(width=0.5,color='#888'),
hoverinfo='none',
mode='lines')
for edge in G.edges():
x0, y0 = G.nodes[edge[0]]['pos']
x1, y1 = G.nodes[edge[1]]['pos']
edge_trace['x'] += tuple([x0, x1, None])
edge_trace['y'] += tuple([y0, y1, None])
node_trace = go.Scatter(
x=[],
y=[],
text=[],
mode='markers',
hoverinfo='text',
marker=dict(
showscale=True,
colorscale='RdBu',
reversescale=True,
color=[],
size=15,
colorbar=dict(
thickness=10,
title='Node Connections',
xanchor='left',
titleside='right'
),
line=dict(width=0)))
for node in G.nodes():
x, y = G.nodes[node]['pos']
node_trace['x'] += tuple([x])
node_trace['y'] += tuple([y])
for node, adjacencies in enumerate(G.adjacency()):
node_trace['marker']['color']+=tuple([len(adjacencies[1])])
node_info =adjacencies[0] +' # of connections: '+str(len(adjacencies[1]))
node_trace['text']+=tuple([node_info])
fig = go.Figure(data=[edge_trace, node_trace],
layout=go.Layout(
title='Red de contratación',
titlefont=dict(size=16),
showlegend=False,
hovermode='closest',
margin=dict(b=20,l=5,r=5,t=40),
annotations=[ dict(
text="No. de contratos sancionados",
showarrow=False,
xref="paper", yref="paper") ],
xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))
iplot(fig)
df_sancion_entidadestop=df_sancion.groupby(["nom_raz_social_contratista"]).agg({"sancion":"count"}).sort_values("sancion",ascending=False)[:10].reset_index()["nom_raz_social_contratista"]
df_sancion_entidadestop
df_sancion_top = df_sancion[df_sancion["nom_raz_social_contratista"].isin(df_sancion_entidadestop[0:5])]
A = list(df_sancion_top["nom_raz_social_contratista"].unique())
B = list(df_sancion_top["nombre_de_la_entidad"].unique())
node_list = list(set(A+B))
G = nx.Graph()
for i in node_list:
G.add_node(i)
for i,j in df_sancion_top.iterrows():
G.add_edges_from([(j["nom_raz_social_contratista"],j["nombre_de_la_entidad"])])
pos = nx.spring_layout(G, k=0.5, iterations=50)
for n, p in pos.items():
G.nodes[n]['pos'] = p
edge_trace = go.Scatter(
x=[],
y=[],
line=dict(width=0.5,color='#888'),
hoverinfo='none',
mode='lines')
for edge in G.edges():
x0, y0 = G.nodes[edge[0]]['pos']
x1, y1 = G.nodes[edge[1]]['pos']
edge_trace['x'] += tuple([x0, x1, None])
edge_trace['y'] += tuple([y0, y1, None])
node_trace = go.Scatter(
x=[],
y=[],
text=[],
mode='markers',
hoverinfo='text',
marker=dict(
showscale=True,
colorscale='RdBu',
reversescale=True,
color=[],
size=15,
colorbar=dict(
thickness=10,
title='Node Connections',
xanchor='left',
titleside='right'
),
line=dict(width=0)))
for node in G.nodes():
x, y = G.nodes[node]['pos']
node_trace['x'] += tuple([x])
node_trace['y'] += tuple([y])
for node, adjacencies in enumerate(G.adjacency()):
node_trace['marker']['color']+=tuple([len(adjacencies[1])])
node_info =adjacencies[0] +' # of connections: '+str(len(adjacencies[1]))
node_trace['text']+=tuple([node_info])
fig = go.Figure(data=[edge_trace, node_trace],
layout=go.Layout(
title='Red de contratación',
titlefont=dict(size=16),
showlegend=False,
hovermode='closest',
margin=dict(b=20,l=5,r=5,t=40),
annotations=[ dict(
text="No. de contratos sancionados",
showarrow=False,
xref="paper", yref="paper") ],
xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))
iplot(fig)